# Session setup for the model-performance script: reset the workspace, open the
# log file, load the cleaned data and attach the modelling packages.
# Remove every object from the workspace so nothing left over from an earlier
# script is picked up below.
rm(list = ls())
# A negative `warn` value makes R discard all warnings.
# NOTE(review): this also hides any warnings raised by the model fits and the
# metric calculations further down.
options(warn=-1)
#-------------------------------------------------------------------------------------
# NOTE(review): machine-specific path; the relative paths below (log file,
# Data/generated/...) are resolved against it.
setwd('~/Dropbox/Research/compliance_blocking/replication/GOTV')
library(logr)
# Open the log that the log_print() calls in this script write to.
lf<-log_open("06_model_performance.log")
# The code below reads an object named `df`; presumably it is provided by this
# .Rdata file — TODO confirm.
load('Data/generated/gotv_cleaned.Rdata')
library(tidyverse)
library(randomForest)
library(ROCR)
#-------------------------------------------------------------------------------------
# Record the R version and attached packages in the log.
log_print(sessionInfo())
#-------------------------------------------------------------------------------------
# Keep the loaded data under `df_all` and list the cities that get their own model.
df_all <- df
cities <- unique(df_all$city)
#-------------------------------------------------------------------------------------
# Full covariate set: one random forest per city predicting C, fit on the rows
# with T == 1 only.
# Columns removed from the predictors for every city.
always_dropped <- c("T", "C", "Y", "age18", "age35", "age50", "age65",
                    "missing_primary", "missing99")
# Seed once; the forests are then fit in the order of `cities`.
set.seed(1)
compliance_models <- list()
for (city in cities) {
    city_rows <- df_all[which(df_all$city == city), ]
    treated_rows <- city_rows[which(city_rows$T == 1), ]

    # City-specific exclusions on top of the common ones: `party` is kept only
    # for Bridgeport and Raleigh, `race` only for Raleigh, and one further
    # column is removed for Bridgeport, Raleigh and Columbus.
    city_dropped <- c(
        if (!(city %in% c("Bridgeport", "Raleigh"))) "party",
        if (city != "Raleigh") "race",
        if (city == "Bridgeport") "voted99",
        if (city == "Raleigh") "voted00",
        if (city == "Columbus") "age"
    )
    predictors <- treated_rows %>%
        dplyr::select(-dplyr::all_of(c(always_dropped, city_dropped)))

    # The response is stored as a factor so randomForest fits a classifier.
    training_data <- data.frame(C = as.factor(treated_rows$C), predictors)
    compliance_models[[city]] <- randomForest(C ~ ., data = training_data)
}
# Score each city's T == 1 rows with that city's model and stack the per-city
# results into `results` (columns: city, scores, C, Chat).
# NOTE(review): the forests are scored on the same rows they were fit on, so
# these are in-sample scores; predict() without `newdata` would return
# out-of-bag scores instead — confirm which one the table is meant to report.
per_city_results <- list()
for (city in cities) {
    df_subset <- df_all[which(df_all$city == city), ]
    # Select the rows with which(), exactly as the model fit does: a bare
    # logical mask would turn an NA in T into an all-NA row, which then
    # propagates into the scores and the accuracy.
    treated <- df_subset[which(df_subset$T == 1), ]
    # Second column of the class-probability matrix (presumably P(C == 1),
    # given that the factor levels of C sort as 0, 1 — TODO confirm).
    scores <- predict(compliance_models[[city]], treated, type = 'prob')[, 2]
    per_city_results[[city]] <- data.frame(city = city, scores, C = treated$C)
}
# Bind once after the loop instead of growing the data frame on every pass;
# unname() keeps rbind from prefixing the row names with the city.
results <- do.call(rbind, unname(per_city_results))


# Hard classification at a 0.5 probability threshold.
results$Chat <- ifelse(results$scores > 0.5, 1, 0)
#-------------------------------------------------------------------------------------
# Per-city metrics for the full models: accuracy is the percentage of rows
# where the thresholded prediction Chat equals C; AUC is computed by ROCR from
# the raw scores. group_by() returns the cities in sorted order.
results_full <- results %>%
    dplyr::group_by(city) %>%
    dplyr::summarise(
        accuracy = 100 * mean(C == Chat),
        AUC = ROCR::performance(
            ROCR::prediction(scores, C),
            measure = "auc"
        )@y.values[[1]]
    )
#-------------------------------------------------------------------------------------
#Restricted set: one random forest per city predicting C, fit on the rows with
#T == 1 using only the two covariates listed for that city.
restricted_covariates <- list(
    "Bridgeport"  = c("voted00", "age"),
    "Raleigh"     = c("primary", "turf"),
    "Minneapolis" = c("primary", "age2"),
    "Detroit"     = c("voted99", "primary"),
    "Columbus"    = c("voted99", "primary"),
    "St. Paul"    = c("age", "primary")
)
# Same seed as the full-set fit; the forests are fit in the order of `cities`.
set.seed(1)
compliance_models <- list()
for (city in cities) {
    city_rows <- df_all[which(df_all$city == city), ]
    treated_rows <- city_rows[which(city_rows$T == 1), ]
    predictors <- treated_rows %>%
        dplyr::select(-c(T, C, Y, age18, age35, age50, age65, missing_primary, missing99))
    # A city without an entry in the table keeps every remaining column.
    if (city %in% names(restricted_covariates)) {
        predictors <- predictors %>%
            dplyr::select(dplyr::all_of(restricted_covariates[[city]]))
    }
    # The response is stored as a factor so randomForest fits a classifier.
    training_data <- data.frame(C = as.factor(treated_rows$C), predictors)
    compliance_models[[city]] <- randomForest(C ~ ., data = training_data)
}

# Score each city's T == 1 rows with that city's restricted model and stack the
# per-city results into `results` (columns: city, scores, C, Chat).
# NOTE(review): the forests are scored on the same rows they were fit on, so
# these are in-sample scores; predict() without `newdata` would return
# out-of-bag scores instead — confirm which one the table is meant to report.
per_city_results <- list()
for (city in cities) {
    df_subset <- df_all[which(df_all$city == city), ]
    # Select the rows with which(), exactly as the model fit does: a bare
    # logical mask would turn an NA in T into an all-NA row, which then
    # propagates into the scores and the accuracy.
    treated <- df_subset[which(df_subset$T == 1), ]
    # Second column of the class-probability matrix (presumably P(C == 1),
    # given that the factor levels of C sort as 0, 1 — TODO confirm).
    scores <- predict(compliance_models[[city]], treated, type = 'prob')[, 2]
    per_city_results[[city]] <- data.frame(city = city, scores, C = treated$C)
}
# Bind once after the loop instead of growing the data frame on every pass;
# unname() keeps rbind from prefixing the row names with the city.
results <- do.call(rbind, unname(per_city_results))

# Hard classification at a 0.5 probability threshold.
results$Chat <- ifelse(results$scores > 0.5, 1, 0)

# Per-city metrics for the restricted models: accuracy is the percentage of
# rows where the thresholded prediction Chat equals C; AUC is computed by ROCR
# from the raw scores. group_by() returns the cities in sorted order.
results_restricted <- results %>%
    dplyr::group_by(city) %>%
    dplyr::summarise(
        accuracy = 100 * mean(C == Chat),
        AUC = ROCR::performance(
            ROCR::prediction(scores, C),
            measure = "auc"
        )@y.values[[1]]
    )

#Table A-4.4: full-model and restricted-model metrics side by side, one row
#per city. The city column of the restricted summary is dropped so it appears
#only once; the rows line up because both summaries are grouped by city.
performance_table <- cbind(results_full, results_restricted[,-1])
# `digits` has one entry for the row names plus one per column: accuracy is
# shown with 0 decimals and AUC with 2.
latex_table <- xtable::xtable(performance_table, digits=c(NA, NA, 0,2,0,2))
log_print(latex_table, include.rownames=FALSE)

log_close()